# Concrete compressive-strength EDA + regression modelling (notebook export).
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# NOTE(review): IPython magic below — this file only runs inside Jupyter/IPython.
%matplotlib inline
import seaborn as sns
sns.set_style(style='darkgrid')
# Remove scientific notations and display numbers with 2 decimal points instead
pd.options.display.float_format = '{:,.2f}'.format
# Load the dataset; assumes concrete.csv sits in the working directory.
df = pd.read_csv('concrete.csv')
df.head(10)
#Exploratory data quality report
#1. Univariate analysis
#data types, Non null values
df.info()
# 9 columns and 1030 rows
# All rows are non-null
#statistical summary - range of values observed, central values (mean and median), standard deviation and quartiles - Q1 (25%), Q2 (50%), Q3 (75%)
df.describe().transpose()
##Mean < Median (Q2-50%) => -ve skew
#strength
#fineagg
##Mean > Median (Q2-50%) => +ve skew
#slag
#ash
#age
#Number of Unique values
df.nunique()
# Missing-values check.
# BUG FIX: the previous expression `df[~df.isna()]` only re-masked every cell
# (a no-op for a DataFrame mask) and never isolated missing data; count the
# NaNs per column instead.
df.isna().sum()
#Incorrect Imputations: rows that hold any non-numeric entry
# NOTE(review): DataFrame.applymap is deprecated since pandas 2.1 (renamed to
# DataFrame.map) — kept as-is for compatibility with the pandas pinned here.
df[~df.applymap(np.isreal).all(1)]
#analysis of the body of distributions / tails; Check for outliers
df.hist(stacked=False, bins=100, figsize=(12,30), layout=(14,2));
# Box plots of every predictor (target 'strength' excluded) to spot outliers.
# BUG FIX: the loop body lost its indentation in the notebook export and was
# a syntax error as plain Python; indentation restored.
plt.figure(figsize=(15,10))
pos = 1
for col in df.drop(columns='strength').columns:
    plt.subplot(3, 3, pos)  # 3x3 grid comfortably fits the 8 predictors
    sns.boxplot(df[col])
    pos += 1
#Age has Outliers
#Water and superplastic also have outliers
#slag and fineagg also have few Outliers
# NOTE(review): pandas_profiling has since been renamed to ydata-profiling;
# this import only works on older installs — confirm the environment.
import pandas_profiling
#Getting the pandas profiling report and check for incorrect imputation
pandas_profiling.ProfileReport(df)
# Findings from the profile report:
#There are 25 duplicate rows
#slag has 471 (45.7%) zeros
#ash has 566 (55.0%) zeros
#superplastic has 379 (36.8%) zeros
#845 distinct values of Strength
#Drop duplicates (the 25 duplicate rows found above)
df = df.drop_duplicates()
#2. Bi-variate analysis
#Correlation between the predictor variables and between the predictors and the target column
df.corr()
# Annotated heatmap of the correlation matrix (colour bar suppressed).
plt.figure(figsize=(10,8))
sns.heatmap(df.corr(),
annot=True,
linewidths=.5,
center=0,
cbar=False,
cmap="YlGnBu")
plt.show()
#superplastic is positively correlated with ash
#superplastic is negatively correlated with water
#fineagg is slightly negatively correlated with water
#Strength is positively correlated with cement
#Strength is slightly positively correlated with superplastic and age
# Pairwise scatter plots with a KDE on the diagonal.
sns.pairplot(df,diag_kind='kde');
#3. Feature Engineering techniques
#a. Identify opportunities (if any) to extract a new feature from existing features
#   Polynomial features are added further below.
#   Ash could be dropped (very low correlation with strength), but it is kept
#   because the desired R2 score is achieved without removing any feature.
#b. Prepare the data for modelling and perform a train/test split
from sklearn.model_selection import train_test_split

# Separate the predictors from the target column.
X = df.drop(columns='strength')
y = df['strength']
# Hold out 30% of the rows for testing; seed fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=7)
X_train.shape, X_test.shape, y_train.shape, y_test.shape
X_train.head()
#c. Check for higher degree attributes, should it be linear, quadratic or higher degree? Use Polynomial Features (Consider degree 2 and 3)
from sklearn.preprocessing import PolynomialFeatures
from sklearn import linear_model
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
#degree 1 Polynomial Feature
poly1 = PolynomialFeatures(degree=1, interaction_only=True)
X_train1 = poly1.fit_transform(X_train)
# BUG FIX: only transform the test split — calling fit_transform on test data
# re-fits the transformer on the held-out set (test-set leakage idiom).
X_test1 = poly1.transform(X_test)
poly1_clf = linear_model.LinearRegression()
poly1_clf.fit(X_train1, y_train)
y1_pred = poly1_clf.predict(X_test1)
#In sample (training) R^2 will always improve with the number of variables!
print("Training score of degree 1 Polynomial Features",poly1_clf.score(X_train1, y_train))
#Out of sample (testing) R^2 is our measure of success and does improve
print("Testing score of degree 1 Polynomial Features", poly1_clf.score(X_test1, y_test))
#degree 2 Polynomial Feature
poly2 = PolynomialFeatures(degree=2, interaction_only=True)
X_train2 = poly2.fit_transform(X_train)
# BUG FIX: transform (not fit_transform) the test split — never fit on test data.
X_test2 = poly2.transform(X_test)
poly2_clf = linear_model.LinearRegression()
poly2_clf.fit(X_train2, y_train)
y2_pred = poly2_clf.predict(X_test2)
#In sample (training) R^2 will always improve with the number of variables!
print("Training score of degree 2 Polynomial Features",poly2_clf.score(X_train2, y_train))
#Out of sample (testing) R^2 is our measure of success and does improve
print("Testing score of degree 2 Polynomial Features", poly2_clf.score(X_test2, y_test))
#Degree2 is much better than Degree1 Polynomial Feature. Let's check degree 3
#degree 3 Polynomial Feature
poly3 = PolynomialFeatures(degree=3, interaction_only=True)
X_train3 = poly3.fit_transform(X_train)
# BUG FIX: transform (not fit_transform) the test split — never fit on test data.
X_test3 = poly3.transform(X_test)
poly_clf3 = linear_model.LinearRegression()
poly_clf3.fit(X_train3, y_train)
y_pred3 = poly_clf3.predict(X_test3)
#In sample (training) R^2 will always improve with the number of variables!
print("Training score of degree 3 Polynomial Features", poly_clf3.score(X_train3, y_train))
#Out of sample (testing) R^2 is our measure of success and does improve
print("Testing score of degree 3 Polynomial Features", poly_clf3.score(X_test3, y_test))
# Degree 3 is overfit: its testing score is worse than degree 2's,
# so the best polynomial feature set is degree 2.
#Shapes of the expanded feature matrices:
print("Shape of Original Training Set", X_train.shape)
print("Shape of Degree 1 Training Set", X_train1.shape)
print("Shape of Degree 2 Training Set", X_train2.shape)
print("Shape of Degree 3 Training Set", X_train3.shape)
#Creating the model and tuning it
# Blank lists to collect model name, training score, testing score and RMSE
# (Root Mean Square Error) for the comparison table built later.
model= []
trainscore = []
testscore = []
rmse = []
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
#Use Kfold Cross Validation to evaluate model performance
num_folds = 10
seed = 7
# BUG FIX: KFold requires shuffle=True when random_state is set (recent
# sklearn raises a ValueError otherwise), and the fold object must actually
# be passed as cv= — the original passed the plain integer, silently
# ignoring the seeded splitter it had just built.
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
scores = cross_val_score(poly2_clf, X_train2, y_train, cv=kfold)
print("Accuracy: %.3f%% (%.3f%%)" % (scores.mean()*100.0, scores.std()*100.0))
#Decision Tree Regression
from sklearn.tree import DecisionTreeRegressor
# random_state pinned for reproducibility, consistent with the seed=7 used
# for the pruned tree and the other models in this file.
dTree = DecisionTreeRegressor(random_state=7)
dTree.fit(X_train2, y_train)
# Compute the test score once and reuse it below.
dtree_test_score = dTree.score(X_test2, y_test)
print(dtree_test_score)
model.append('Decision Tree')
trainscore.append(dTree.score(X_train2, y_train))
testscore.append(dtree_test_score)
rmse.append(mean_squared_error(y_test, dTree.predict(X_test2))**0.5)
# Pruned (regularised) decision tree: depth capped and a minimum leaf size
# imposed to curb the overfitting seen in the unconstrained tree above.
pruned_tree = DecisionTreeRegressor(random_state = 7, max_depth=25, min_samples_leaf=5)
pruned_tree.fit(X_train2, y_train)
print(pruned_tree.score(X_test2, y_test))
# Record name, train/test R^2 and RMSE for the comparison table.
model.append('Decision Tree Pruned')
trainscore.append(pruned_tree.score(X_train2, y_train))
testscore.append(pruned_tree.score(X_test2, y_test))
rmse.append(mean_squared_error(y_test,pruned_tree.predict(X_test2))**0.5)
#Bagging Regression over the degree-2 linear model
from sklearn.ensemble import BaggingRegressor
# NOTE(review): base_estimator was renamed to estimator in sklearn >= 1.2 and
# removed in 1.4 — confirm against the installed version.
bgcl = BaggingRegressor(base_estimator=poly2_clf, n_estimators=200,random_state=7)
bgcl = bgcl.fit(X_train2, y_train)
bgcl_predict = bgcl.predict(X_test2)
print(bgcl.score(X_test2 , y_test))
model.append('Bagging')
trainscore.append(bgcl.score(X_train2, y_train))
testscore.append(bgcl.score(X_test2, y_test))
# Reuse the predictions computed above instead of predicting a second time.
rmse.append(mean_squared_error(y_test, bgcl_predict)**0.5)
# AdaBoost Regression boosting the degree-2 linear model
from sklearn.ensemble import AdaBoostRegressor
# NOTE(review): base_estimator was renamed to estimator in sklearn >= 1.2 and
# removed in 1.4 — confirm against the installed version.
ada = AdaBoostRegressor(base_estimator=poly2_clf,random_state=7,n_estimators= 200, learning_rate=0.1)
ada.fit(X_train2, y_train)
ada_predict = ada.predict(X_test2)
print(ada.score(X_test2 , y_test))
model.append('Ada Boosting')
trainscore.append(ada.score(X_train2, y_train))
testscore.append(ada.score(X_test2, y_test))
# Reuse the predictions computed above instead of predicting a second time.
rmse.append(mean_squared_error(y_test, ada_predict)**0.5)
# Gradient Boosting Regression
from sklearn.ensemble import GradientBoostingRegressor
grad = GradientBoostingRegressor(random_state=7, n_estimators=200)
grad.fit(X_train2, y_train)
grad_predict = grad.predict(X_test2)
print(grad.score(X_test2 , y_test))
model.append('Gradient Boosting')
trainscore.append(grad.score(X_train2, y_train))
testscore.append(grad.score(X_test2, y_test))
# Reuse the predictions computed above instead of predicting a second time.
rmse.append(mean_squared_error(y_test, grad_predict)**0.5)
# Random Forest Regression
from sklearn.ensemble import RandomForestRegressor
rand = RandomForestRegressor(random_state=7, n_estimators=200)
rand.fit(X_train2, y_train)
# BUG FIX: predictions were taken from `grad` (copy/paste from the gradient
# boosting cell); use the random forest model itself.
rand_predict = rand.predict(X_test2)
print(rand.score(X_test2 , y_test))
model.append('Random Forest')
trainscore.append(rand.score(X_train2, y_train))
testscore.append(rand.score(X_test2, y_test))
# Reuse the predictions computed above instead of predicting a second time.
rmse.append(mean_squared_error(y_test, rand_predict)**0.5)
# Assemble the model-comparison table from the collected metrics,
# indexed by model name.
results = pd.DataFrame({
    'Model': model,
    'Training Score': trainscore,
    'Testing Score': testscore,
    'Root Mean Square Error': rmse,
}).set_index('Model')
results
#2 Best models are Gradient Boosting and Random Forest. R2 score is high and RMSE is low.
#Kfold Cross Validation to evaluate Gradient Boosting model performance
num_folds = 10
seed = 7
kfold = KFold(n_splits=num_folds, random_state=seed)
scores = cross_val_score(grad, X_train2, y_train, cv=num_folds)
scores.mean()
print("Accuracy after cross validation for Gradient Boosting: %.3f%% (%.3f%%)" % (scores.mean()*100.0, scores.std()*100.0))
#Kfold Cross Validation to evaluate Random Forest model performance
scores1 = cross_val_score(rand, X_train2, y_train, cv=num_folds)
scores1.mean()
print("Accuracy after cross validation for Random Forest: %.3f%% (%.3f%%)" % (scores1.mean()*100.0, scores1.std()*100.0))
#Get Gradient boosting params (baseline before hyper-parameter tuning)
grad.get_params()
#Get Random Forest params (baseline before hyper-parameter tuning)
rand.get_params()
# Fresh accumulators for the hyper-parameter-tuned models.
h_model= []
h_trainscore = []
h_testscore = []
h_rmse = []
# Blanks list to store model name, training score, testing score, RMSE after hyperparameter tuning
# Both Randomized Search and Grid Search were tried; Randomized Search gave
# better results and a faster run time, so only its results are shown below.
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint
from time import time
# Parameter distributions to sample for the Random Forest search.
param_dist = {"max_depth": [3, 7, 25, None],
              "max_features": sp_randint(1, 11),
              "min_samples_split": sp_randint(2, 11),
              "min_samples_leaf": sp_randint(1, 11),
              "bootstrap": [True, False]}
# run randomized search for Random Forest
samples = 10  # number of random candidate settings to sample
# FIX: random_state pinned so the sampled candidates are reproducible,
# consistent with the seed=7 used throughout this file.
randomCV = RandomizedSearchCV(rand, param_distributions=param_dist, n_iter=samples, random_state=7)
randomCV.fit(X_train2, y_train)
print(randomCV.best_params_)
#Storing Scores
h_model.append('RandomizedSearchCV Random Forest')
h_trainscore.append(randomCV.score(X_train2, y_train))
h_testscore.append(randomCV.score(X_test2, y_test))
h_rmse.append(mean_squared_error(y_test,randomCV.predict(X_test2))**0.5)
# Hyper-parameter grid sampled by the Gradient Boosting randomized search.
parameters = {
    "learning_rate": [0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.25, 0.5],
    "min_samples_split": [2, 3, 5, 6, 9, 10],
    "min_samples_leaf": [1, 2, 3, 6, 9, 10],
    "max_depth":[3,5,8],
    "max_features":["log2","sqrt"],
    # NOTE(review): the "mae" criterion was deprecated in sklearn 1.0 and
    # removed in 1.2 (use "absolute_error") — confirm the installed version.
    "criterion": ["friedman_mse", "mae"],
    "subsample":[0.5, 0.618, 0.8, 0.85, 0.9, 0.95, 1.0],
    "n_estimators":[10, 100, 200, 500]
}
# FIX: random_state pinned so the sampled candidates are reproducible.
rand_grad = RandomizedSearchCV(grad, param_distributions=parameters, cv=10, n_jobs=-1, random_state=7)
rand_grad.fit(X_train2, y_train)
print(rand_grad.score(X_train2, y_train))
print(rand_grad.best_params_)
h_model.append('RandomizedSearchCV Gradient boosting')
h_trainscore.append(rand_grad.score(X_train2, y_train))
h_testscore.append(rand_grad.score(X_test2, y_test))
h_rmse.append(mean_squared_error(y_test,rand_grad.predict(X_test2))**0.5)
# Summary table for the tuned models, indexed by model name.
Rand_results = pd.DataFrame({
    'Model': h_model,
    'Training Score': h_trainscore,
    'Testing Score': h_testscore,
    'Root Mean Square Error': h_rmse,
}).set_index('Model')
Rand_results
##RandomizedSearchCV of Gradient boosting is the best model.
#Training Score of 98%,
#Testing Score of 93%
#RMSE OF 4.27